library(dplyr)
# For Model fitting
library(lme4)
library(purrr)
# For diagnostics
library(performance)
# For adding new columns
library(tibble)
library(caret)# Load data
sys.source("./codes/scripts/code_join_data_full_dataset.R", envir = knitr::knit_global())# Load functions
## Models
sys.source("./codes/functions/functions_models.R", envir = knitr::knit_global())Select performance and traits variables
Transform variables to the dataset
Back transformation
Even though you’ve done a statistical test on a transformed variable, such as the log of fish abundance, it is not a good idea to report your means, standard errors, etc. in transformed units. A graph that showed that the mean of the log of fish per 75 meters of stream was 1.044 would not be very informative for someone who can’t do fractional exponents in their head. Instead, you should back-transform your results. This involves doing the opposite of the mathematical function you used in the data transformation. For the log transformation, you would back-transform by raising 10 to the power of your number. For example, the log transformed data above has a mean of 1.044 and a 95% confidence interval of ±0.344 log-transformed fish. The back-transformed mean would be 101.044=11.1 fish. The upper confidence limit would be 10(1.044+0.344)=24.4 fish, and the lower confidence limit would be 10(1.044-0.344)=5.0 fish. Note that the confidence interval is not symmetrical; the upper limit is 13.3 fish above the mean, while the lower limit is 6.1 fish below the mean. Also note that you can’t just back-transform the confidence interval and add or subtract that from the back-transformed mean; you can’t take 100.344 and add or subtract that.
Square-root transformation. This consists of taking the square root of each observation. The back transformation is to square the number. If you have negative numbers, you can’t take the square root; you should add a constant to each number to make them all positive.
People often use the square-root transformation when the variable is a count of something, such as bacterial colonies per petri dish, blood cells going through a capillary per minute, mutations per generation, etc.
# Box-Cox transformation was previously run
data_for_models_transformed <-
data_for_models %>%
mutate(
# Plant's performance
total_biomass_sqrt = sqrt(total_biomass),
above_biomass_sqrt = sqrt(above_biomass),
below_biomass_log = log(below_biomass),
agr_log = log(agr),
# NO TRANSFORMATION variable already in log-log
rgr = rgr,
# NO TRANSFORMATION variable already in log-log
rgr_slope = rgr_slope,
# Traits
amax_log = log(amax),
gs_sqrt = sqrt(gs),
wue_log = log(wue),
# d13 and d15 where not transformed because the data has negative values
pnue_log = log(data_for_models$pnue),
# Covariate
init_height = log(init_height)) %>%
# Remove original variables (non-transformed)
dplyr::select(spcode, treatment, nfixer, init_height, everything(),
-c(5:8,11:13,16)) Models: Questions 1 and 2
\[response\sim treatment*fixer\ + initial\ height + random( 1|\ specie)\]
# Take response variables' names
response_vars_q1_q2 <-
set_names(names(data_for_models_transformed)[5:(ncol(data_for_models_transformed))])#data_for_models_transformed[data_for_models_transformed$id == 48,]
models_list_q1_q2 <- map(response_vars_q1_q2, ~ mixed_model_1(response = .x,
data = data_for_models_transformed))Models Nodule count
- Chapter 9 Mixed models in ecology check glmmML package for count data
- GOOD ref https://www.dataquest.io/blog/tutorial-poisson-regression-in-r/
- #https://www.flutterbys.com.au/stats/tut/tut11.2a.html
# Load data
# This step was done like this because I am working with a subset of the data
# source cleaned data
source("./codes/scripts/code_clean_data_nodules.R")
# Delete unused variables
data_nodules_cleaned <-
data_nodules_cleaned %>%
# add id to rownames for keep track of the rows
column_to_rownames("id") %>%
dplyr::select(spcode,treatment, everything())# Boxcox for nodule count
BoxCoxTrans(data_nodules_cleaned$number_of_root_nodulation) Box-Cox Transformation
48 data points used to estimate Lambda
Input data summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
7.00 26.75 49.50 65.40 104.50 167.00
Largest/Smallest: 23.9
Sample Skewness: 0.689
Estimated Lambda: 0.1
With fudge factor, Lambda = 0 will be used for transformations
data_nodules_cleaned spcode treatment number_of_root_nodulation
5 ec ambientrain 25
6 ec ambientrain 30
7 ec ambientrain 7
8 ec ambientrain 35
9 ec ambientrain_water_nutrients 11
10 ec ambientrain_water_nutrients 49
11 ec ambientrain_water_nutrients 29
12 ec ambientrain_water_nutrients 57
13 ec ambientrain_water 48
14 ec ambientrain_water 17
15 ec ambientrain_water 55
16 ec ambientrain_water 27
17 ec ambientrain_nutrients 29
18 ec ambientrain_nutrients 30
19 ec ambientrain_nutrients 22
20 ec ambientrain_nutrients 26
105 gs ambientrain 20
106 gs ambientrain 27
107 gs ambientrain 62
108 gs ambientrain 26
109 gs ambientrain_water_nutrients 26
110 gs ambientrain_water_nutrients 72
111 gs ambientrain_water_nutrients 58
112 gs ambientrain_water_nutrients 62
113 gs ambientrain_water 16
114 gs ambientrain_water 29
115 gs ambientrain_water 18
116 gs ambientrain_water 18
117 gs ambientrain_nutrients 42
118 gs ambientrain_nutrients 69
119 gs ambientrain_nutrients 41
120 gs ambientrain_nutrients 50
125 dr ambientrain 141
126 dr ambientrain 133
127 dr ambientrain 113
128 dr ambientrain 142
129 dr ambientrain_water_nutrients 165
130 dr ambientrain_water_nutrients 129
131 dr ambientrain_water_nutrients 163
132 dr ambientrain_water_nutrients 167
133 dr ambientrain_water 104
134 dr ambientrain_water 114
135 dr ambientrain_water 112
136 dr ambientrain_water 106
137 dr ambientrain_nutrients 104
138 dr ambientrain_nutrients 94
139 dr ambientrain_nutrients 116
140 dr ambientrain_nutrients 103
nodule_count_lab nodule_mass_in_the_lab average_nodule_weight
5 7 0.011 0.001571429
6 5 0.052 0.010400000
7 4 0.023 0.005750000
8 8 0.021 0.002625000
9 3 0.047 0.015666667
10 14 0.042 0.003000000
11 14 0.068 0.004857143
12 9 0.055 0.006111111
13 14 0.036 0.002571429
14 4 0.043 0.010750000
15 9 0.038 0.004222222
16 10 0.024 0.002400000
17 10 0.114 0.011400000
18 9 0.146 0.016222222
19 6 0.022 0.003666667
20 5 0.073 0.014600000
105 8 0.016 0.002000000
106 13 0.027 0.002076923
107 11 0.008 0.000727273
108 7 0.017 0.002428571
109 10 0.022 0.002200000
110 7 0.015 0.002142857
111 9 0.028 0.003111111
112 9 0.026 0.002888889
113 6 0.025 0.004166667
114 8 0.027 0.003375000
115 6 0.032 0.005333333
116 8 0.015 0.001875000
117 9 0.041 0.004555556
118 9 0.012 0.001333333
119 11 0.027 0.002454545
120 10 0.008 0.000800000
125 16 0.012 0.000750000
126 12 0.013 0.001083333
127 18 0.031 0.001722222
128 12 0.014 0.001166667
129 18 0.018 0.001000000
130 14 0.010 0.000714286
131 17 0.020 0.001176471
132 17 0.015 0.000882353
133 11 0.013 0.001181818
134 15 0.010 0.000666667
135 13 0.022 0.001692308
136 9 0.008 0.000888889
137 16 0.012 0.000750000
138 10 0.018 0.001800000
139 11 0.019 0.001727273
140 13 0.026 0.002000000
estimated_nodule_mass_per_plant init_height
5 0.039 17.0
6 0.312 25.0
7 0.040 14.0
8 0.092 16.0
9 0.172 20.5
10 0.147 21.0
11 0.141 24.0
12 0.348 25.5
13 0.123 20.0
14 0.183 20.0
15 0.232 23.0
16 0.065 24.5
17 0.331 24.5
18 0.487 13.5
19 0.081 17.0
20 0.380 22.0
105 0.040 12.5
106 0.056 13.0
107 0.045 20.0
108 0.063 19.0
109 0.057 17.5
110 0.154 17.5
111 0.180 16.5
112 0.179 14.5
113 0.067 15.5
114 0.098 16.0
115 0.096 15.5
116 0.034 10.5
117 0.191 12.5
118 0.092 17.5
119 0.101 13.5
120 0.040 21.0
125 0.106 9.5
126 0.144 11.5
127 0.195 11.0
128 0.166 11.0
129 0.165 11.0
130 0.092 8.5
131 0.192 10.0
132 0.147 12.5
133 0.123 10.0
134 0.076 7.0
135 0.190 9.0
136 0.094 9.5
137 0.078 10.5
138 0.169 8.5
139 0.200 11.0
140 0.206 9.5
m4 lmer gaussian
lmer_gaussian <- lmer(number_of_root_nodulation ~ treatment + init_height +
(1 |spcode),
data = data_nodules_cleaned)lmer_gaussian_log <- lmer(log(number_of_root_nodulation) ~ treatment + init_height +
(1 |spcode),
data = data_nodules_cleaned)m5 glmer poisson
glmer_poisson <- glmer(number_of_root_nodulation ~ treatment + init_height +
(1 |spcode), family = "poisson",
data = data_nodules_cleaned)models_list_nodule_count <- list(lmer_gaussian, lmer_gaussian_log, glmer_poisson)
names(models_list_nodule_count) <- c("lmer_gaussian",
"lmer_gaussian_log",
"glmer_poisson")Mycorrhizal colonization
I decided not to include it, because I want to focus on Nfixing vrs non-Fixing,
also I don't trust on the dataModels: Questions 3
\[performance\sim treatment*\ fixer*\ scale(log(trait)\ + initial\ height + random( 1|\ specie)\]
# This funtion takes two columns a create a model formula with all the possible
# combinations
model_combinations_formulas <- function(y_var, x_var){
variables <- crossing(y_var, x_var)
pattern <- c('\\+.*|nfixer|treatment|[[:punct:]]| ')
# Model
models <- paste0(variables$y_var,"~treatment*nfixer*",
variables$x_var,"+init_height+(1|spcode)")
formulas <- map(models, as.formula)
# Add name to the list
names(formulas) <- stringr::str_replace_all(formulas, pattern, replacement = '')
return(formulas)
}Scale preditors
data_for_models_transformed_scaled <-
data_for_models_transformed %>%
# test for being sure that I select the traits
#select(c(4,7,8,13:16))
# scale only the predictors traits
mutate(across(c(4,7,8,13:16), scale))# Select traits (x_vars)
traits_names <-
set_names(names(data_for_models_transformed_scaled)
[c(7,8,13:16)])
traits_names d13c d15n amax_log gs_sqrt wue_log pnue_log
"d13c" "d15n" "amax_log" "gs_sqrt" "wue_log" "pnue_log"
# Select plants performance vars (y_vars)
performance_names <-
set_names(names(data_for_models_transformed_scaled)[c(5,6,9:12)])
performance_names rgr rgr_slope total_biomass_sqrt
"rgr" "rgr_slope" "total_biomass_sqrt"
above_biomass_sqrt below_biomass_log agr_log
"above_biomass_sqrt" "below_biomass_log" "agr_log"
models_lmer_formulas <- model_combinations_formulas(performance_names, traits_names)
length(models_lmer_formulas)[1] 36
models_list_q3 <- map(models_lmer_formulas,
~ lmer(.x, data = data_for_models_transformed_scaled))Validation plots
Collinearity
map(models_list_q1_q2, check_collinearity)$rgr
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.97 1.99 0.25
nfixer 1.04 1.02 0.96
init_height 1.09 1.04 0.92
treatment:nfixer 3.92 1.98 0.26
$rgr_slope
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.97 1.99 0.25
nfixer 1.05 1.02 0.95
init_height 1.09 1.04 0.92
treatment:nfixer 3.93 1.98 0.25
$d13c
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.92 1.98 0.25
nfixer 1.21 1.10 0.83
init_height 1.08 1.04 0.93
treatment:nfixer 4.29 2.07 0.23
$d15n
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.87 1.97 0.26
nfixer 1.73 1.32 0.58
init_height 1.07 1.03 0.94
Moderate Correlation
Term VIF Increased SE Tolerance
treatment:nfixer 5.53 2.35 0.18
$total_biomass_sqrt
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.94 1.98 0.25
nfixer 1.16 1.07 0.87
init_height 1.08 1.04 0.92
treatment:nfixer 4.17 2.04 0.24
$above_biomass_sqrt
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.96 1.99 0.25
nfixer 1.07 1.03 0.94
init_height 1.09 1.04 0.92
treatment:nfixer 3.97 1.99 0.25
$below_biomass_log
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.92 1.98 0.26
nfixer 1.25 1.12 0.80
init_height 1.08 1.04 0.93
treatment:nfixer 4.38 2.09 0.23
$agr_log
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.97 1.99 0.25
nfixer 1.05 1.03 0.95
init_height 1.09 1.04 0.92
treatment:nfixer 3.94 1.98 0.25
$amax_log
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.96 1.99 0.25
nfixer 1.07 1.04 0.93
init_height 1.09 1.04 0.92
treatment:nfixer 3.98 2.00 0.25
$gs_sqrt
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.85 1.96 0.26
nfixer 2.04 1.43 0.49
init_height 1.06 1.03 0.94
Moderate Correlation
Term VIF Increased SE Tolerance
treatment:nfixer 6.28 2.51 0.16
$wue_log
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.90 1.97 0.26
nfixer 1.36 1.17 0.74
init_height 1.07 1.04 0.93
treatment:nfixer 4.64 2.15 0.22
$pnue_log
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 3.90 1.98 0.26
nfixer 1.33 1.15 0.75
init_height 1.08 1.04 0.93
treatment:nfixer 4.58 2.14 0.22
map(models_list_nodule_count, check_collinearity)$lmer_gaussian
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 1.06 1.03 0.94
init_height 1.06 1.03 0.94
$lmer_gaussian_log
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 1.06 1.03 0.95
init_height 1.06 1.03 0.95
$glmer_poisson
# Check for Multicollinearity
Low Correlation
Term VIF Increased SE Tolerance
treatment 1.08 1.04 0.93
init_height 1.08 1.04 0.93
#Warning: Model has interaction terms. VIFs might be inflated. You may check
#multicollinearity among predictors of a model without interaction terms.
#map(models_list_q3, check_mul)Bolker’s plots
bolker_validation <- function(model) {
a <- plot(model, type = c("p", "smooth"))
## heteroscedasticity
b <- plot(model,sqrt(abs(resid(.))) ~ fitted(.), type = c("p", "smooth"))
cowplot::plot_grid(a,b,nrow = 1)
}Models for questions 1,2
map(models_list_q1_q2, bolker_validation)$rgr
$rgr_slope
$d13c
$d15n
$total_biomass_sqrt
$above_biomass_sqrt
$below_biomass_log
$agr_log
$amax_log
$gs_sqrt
$wue_log
$pnue_log
Models for nodule count
map(models_list_nodule_count, bolker_validation)$lmer_gaussian
$lmer_gaussian_log
$glmer_poisson
Models for question 3
map(models_list_q3, bolker_validation)$`abovebiomasssqrt~amaxlog`
$`abovebiomasssqrt~d13c`
$`abovebiomasssqrt~d15n`
$`abovebiomasssqrt~gssqrt`
$`abovebiomasssqrt~pnuelog`
$`abovebiomasssqrt~wuelog`
$`agrlog~amaxlog`
$`agrlog~d13c`
$`agrlog~d15n`
$`agrlog~gssqrt`
$`agrlog~pnuelog`
$`agrlog~wuelog`
$`belowbiomasslog~amaxlog`
$`belowbiomasslog~d13c`
$`belowbiomasslog~d15n`
$`belowbiomasslog~gssqrt`
$`belowbiomasslog~pnuelog`
$`belowbiomasslog~wuelog`
$`rgr~amaxlog`
$`rgr~d13c`
$`rgr~d15n`
$`rgr~gssqrt`
$`rgr~pnuelog`
$`rgr~wuelog`
$`rgrslope~amaxlog`
$`rgrslope~d13c`
$`rgrslope~d15n`
$`rgrslope~gssqrt`
$`rgrslope~pnuelog`
$`rgrslope~wuelog`
$`totalbiomasssqrt~amaxlog`
$`totalbiomasssqrt~d13c`
$`totalbiomasssqrt~d15n`
$`totalbiomasssqrt~gssqrt`
$`totalbiomasssqrt~pnuelog`
$`totalbiomasssqrt~wuelog`
Performance package
Models for questions 1,2
map(models_list_q1_q2, check_model)$rgr
$rgr_slope
$d13c
$d15n
$total_biomass_sqrt
$above_biomass_sqrt
$below_biomass_log
$agr_log
$amax_log
$gs_sqrt
$wue_log
$pnue_log
Models for nodule count
map(models_list_nodule_count, check_model)$lmer_gaussian
$lmer_gaussian_log
$glmer_poisson
Models for question 3
map(models_list_q3, check_model)$`abovebiomasssqrt~amaxlog`
$`abovebiomasssqrt~d13c`
$`abovebiomasssqrt~d15n`
$`abovebiomasssqrt~gssqrt`
$`abovebiomasssqrt~pnuelog`
$`abovebiomasssqrt~wuelog`
$`agrlog~amaxlog`
$`agrlog~d13c`
$`agrlog~d15n`
$`agrlog~gssqrt`
$`agrlog~pnuelog`
$`agrlog~wuelog`
$`belowbiomasslog~amaxlog`
$`belowbiomasslog~d13c`
$`belowbiomasslog~d15n`
$`belowbiomasslog~gssqrt`
$`belowbiomasslog~pnuelog`
$`belowbiomasslog~wuelog`
$`rgr~amaxlog`
$`rgr~d13c`
$`rgr~d15n`
$`rgr~gssqrt`
$`rgr~pnuelog`
$`rgr~wuelog`
$`rgrslope~amaxlog`
$`rgrslope~d13c`
$`rgrslope~d15n`
$`rgrslope~gssqrt`
$`rgrslope~pnuelog`
$`rgrslope~wuelog`
$`totalbiomasssqrt~amaxlog`
$`totalbiomasssqrt~d13c`
$`totalbiomasssqrt~d15n`
$`totalbiomasssqrt~gssqrt`
$`totalbiomasssqrt~pnuelog`
$`totalbiomasssqrt~wuelog`
Save lists with the models
saveRDS(models_list_q1_q2, file = "./processed_data/models_q1_q2.RData")
saveRDS(models_list_q3, file = "./processed_data/models_q3_3_way_interaction.RData")
saveRDS(models_list_nodule_count, file = "./processed_data/models_list_nodule_count.RData")